import pandas as pd #Library for Processing; Organizing; and cleaning the data
import matplotlib.pyplot as plt #Library for Grapical Analysis
from matplotlib.pyplot import rcParams
import numpy as np #Library for solving mathematical models
import random
import tensorflow as tf #Library for Neural Networks and Deep Learning
from tensorflow import keras
from datetime import datetime
import datetime as dt
import collections #Library for collection of data(Tuples)
from collections import Counter
import os
import seaborn as sns #Library for Data Visualization
from sklearn.model_selection import train_test_split #Library for splitting the dataset
from sklearn import linear_model as lm
from sklearn.linear_model import LinearRegression #Library for the Linear model and its Regression
from sklearn import metrics
import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
%matplotlib inline
# Load the "darksky" CSV file into a DataFrame.
darksky_csv_path = r'C:\Users\Kowshik Kumar B\Documents\HU\500\Project\darksky.csv'
darksky = pd.read_csv(darksky_csv_path)
# Preview the first rows of the "darksky" dataset.
darksky.head()
# Preview the last rows of the "darksky" dataset.
darksky.tail()
# Count the missing (NA) values in every column.
darksky.isnull().sum()
# Dimensions of the dataset as (rows, columns).
darksky.shape
# Data type of every column.
darksky.dtypes
# Summary statistics, transposed so that every column becomes one row.
darksky.describe().transpose()
# Correlation matrix of the "darksky" dataset, drawn as an annotated heatmap.
# Only numeric/boolean columns are correlated. They are selected explicitly
# because pandas >= 2.0 no longer drops non-numeric columns silently inside
# DataFrame.corr() (numeric_only defaults to False) and raises instead.
numeric_darksky = darksky.select_dtypes(include=['number', 'bool'])
f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(numeric_darksky.corr(), annot=True, linewidths=0.4, fmt='.1f', ax=ax)
plt.show()
# Histograms of the "darksky" dataset.
# The rcParams entry changes the default figure size for this and any later
# figure that does not pass its own figsize.
rcParams['figure.figsize'] = (15, 15)
darksky.hist()
# Frequency plot of the categorical weather-summary column.
plt.figure(figsize=(10, 8))
count_ax = sns.countplot(x=darksky['Summary(0=Sunny, 1=Rain, 2=Snow)'])
count_ax.set_title("Count of number of days on their weather type condition")
count_ax.set_xlabel("Weather type[0=Sunny,1=Rain,2=Snow]")
count_ax.set_ylabel("Count of days")
# Pair plots of the "darksky" dataset.
sns.pairplot(darksky)

# Joint plots (scatter + regression line with marginal distributions).
# Labels and titles are set through the JointGrid returned by jointplot
# instead of plt.title/plt.xlabel/plt.ylabel: the pyplot calls act on
# whichever of the grid's axes happens to be "current", and a title placed on
# the joint axes sits on top of the upper marginal plot. A figure-level
# suptitle with a little head-room avoids both problems.
# Each entry: (x column, y column, colour, x label, y label, title or None).
joint_plot_specs = [
    ("apparentTemperature", "dewPoint", 'b',
     "Apparent Temperature", "Dew Point",
     "Scatterplot between Apparent Temperature and Dew Point."),
    ("temperature", "dewPoint", 'g',
     "Temperature", "Dew Point",
     "Scatterplot between Temperature and Dew Point."),
    ("apparentTemperature", "temperature", 'grey',
     "Apparent Temperature", "Temperature",
     None),  # the original third plot carries no title
]
for x_col, y_col, colour, x_label, y_label, title in joint_plot_specs:
    grid = sns.jointplot(x=x_col, y=y_col, data=darksky, kind="reg", height=8, color=colour)
    grid.set_axis_labels(x_label, y_label)
    if title is not None:
        grid.fig.suptitle(title)
        # Leave room above the top marginal plot for the title.
        grid.fig.subplots_adjust(top=0.95)
# Histogram with density curve for each selected variable, one figure per
# variable. Each entry pairs the column name with its human-readable label.
# NOTE: sns.distplot is deprecated since seaborn 0.11; it is kept here so the
# figures stay exactly as before.
for column, label in (
    ('apparentTemperature', "Apparent Temperature"),
    ('temperature', "Temperature"),
    ('dewPoint', "Dew Point"),
    ('humidity', "Humidity"),
):
    plt.figure(figsize=(10, 8))
    sns.distplot(a=darksky[column], bins=10, hist=True)
    plt.title("Histogram and Density plot for " + label)
    plt.xlabel(label + " value")
    plt.ylabel("value")
# Box plots of selected variables grouped by weather condition, one figure
# per variable. Each entry pairs the column name with its display label,
# which is used for both the title and the y-axis (this also corrects the
# misspelt "Precipitaion" in the third title).
weather_column = "Summary(0=Sunny, 1=Rain, 2=Snow)"
for column, label in (
    ("apparentTemperature", "Apparent Temperature"),
    ("humidity", "Humidity"),
    ("precipIntensity", "Precipitation Intensity"),
):
    plt.figure(figsize=(10, 8))
    sns.boxplot(x=weather_column, y=column, data=darksky)
    plt.title("Box plot for Weather Condition and " + label)
    plt.xlabel("Weather Condition(0=Sunny, 1=Rain, 2=Snow)")
    plt.ylabel(label)
# Convert the 'time' column to pandas datetime values.
# pd.to_datetime returns a new Series, so the result must be assigned back;
# calling it without assignment leaves the column unchanged.
# NOTE(review): if 'time' holds Unix epoch seconds, pass unit='s' here -
# confirm against the CSV.
darksky['time'] = pd.to_datetime(darksky['time'])
# Tidying the dataset: fill the missing (NA) values column by column.

# Columns whose gaps are replaced by the column mean.
mean_imputed_columns = [
    'temperature', 'apparentTemperature', 'ozone', 'windGust', 'humidity',
    'pressure', 'windSpeed', 'windBearing', 'cloudCover', 'visibility',
    'dewPoint',
]
for column in mean_imputed_columns:
    darksky[column] = darksky[column].fillna(darksky[column].mean())

# Precipitation columns: a missing value is replaced by 0.
for column in ['precipIntensity', 'precipProbability']:
    darksky[column] = darksky[column].fillna(0)

# uvIndex: carry the previous observation forward. Series.ffill() replaces
# fillna(method='ffill'), which is deprecated since pandas 2.1. Forward fill
# cannot fill NA values at the very start of the column.
darksky['uvIndex'] = darksky['uvIndex'].ffill()

# Accessing the first few rows of the "darksky" dataset after tidying.
darksky.head()
# Checking the total number of NA values in each column after tidying.
darksky.isna().sum()
# Predictor columns and dependent variable ('temperature').
# NOTE (from the original author): only the significant variables are kept in
# the model; the rest were removed by backward elimination.
predictor_columns = [
    'precipIntensity', 'humidity', 'pressure', 'windSpeed',
    'windBearing', 'cloudCover', 'visibility', 'ozone',
]
X = darksky[predictor_columns].values
y = darksky['temperature'].values
# Split into train (80%) and test (20%) sets; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Fit the multiple linear regression on the training set.
model = LinearRegression()
model.fit(X_train, y_train)
# statsmodels OLS summary of the same model; add_constant supplies the
# intercept column that sm.OLS does not add on its own.
X_OLS = sm.add_constant(X_train)
sm.OLS(y_train, X_OLS).fit().summary()
# Coefficients of the fitted regression equation, one row per predictor.
coefficient_df = pd.DataFrame(model.coef_, index=predictor_columns, columns=['Coefficient'])
coefficient_df
# Intercept of the fitted regression equation.
model.intercept_
# Predict on the test set and place the predictions next to the actual values.
y_pred = model.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# Keep the first ten rows for a readable side-by-side bar chart.
df1 = df.iloc[:10]
df1
df1.plot.bar(figsize=(10, 8))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()
# Error metrics on the test set.
metrics.mean_absolute_error(y_test, y_pred)  # MAE
metrics.mean_squared_error(y_test, y_pred)  # MSE
np.sqrt(metrics.mean_squared_error(y_test, y_pred))  # RMSE